# Computations
import pandas as pd
import numpy as np
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6 and the
# old names were removed afterwards, so use whichever spelling this installation ships.
# If neither is available the default matplotlib style is kept.
for _style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style in plt.style.available:
        plt.style.use(_style)
        break
# Notebook-wide font sizes and text color for all matplotlib figures.
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## WordCloud
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")
In this article, we work on a dataset available from the UCI Machine Learning Repository. The data is related to direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe to a term deposit (variable y).
This dataset is based on "Bank Marketing" UCI dataset (please check the description at archive.ics.uci.edu/ml/datasets/Bank+Marketing). The data is enriched by the addition of five new social and economic features/attributes (national wide indicators from a ~10M population country), published by the Banco de Portugal and publicly available at: bportugal.pt/estatisticasweb. This dataset is almost identical to the one used in [Moro et al., 2014] (it does not include all attributes due to privacy concerns).
The data is related to the direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (bank term deposit) would be subscribed ('yes') or not ('no').
There are four datasets:
The classification goal is to predict if the client will subscribe (yes/no) to a term deposit (variable y).
The zip file includes two datasets:
The binary classification goal is to predict if the client will subscribe to a bank term deposit (variable y).
# Read the modified bank-marketing CSV into the notebook-wide `Data` frame,
# then preview its first five rows with numeric values rounded to 2 decimals.
Data = pd.read_csv(filepath_or_buffer='Data/Bank_mod.csv')
display(Data.head(n=5).round(decimals=2))
| | Age | Job | Marital | Education | Default | Housing | Loan | Contact | Month | Day Of Week | ... | Campaign | Pdays | Previous | Poutcome | Employment Variation Rate | Consumer Price Index | Consumer Confidence Index | Euribor three Month Rate | Number of Employees | Term Deposit Subscription |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | Housemaid | Married | Basic.4Y | No | No | No | Telephone | May | Monday | ... | 1 | 999 | 0 | Nonexistent | 1.1 | 93.99 | -36.4 | 4.86 | 5191.0 | No |
| 1 | 57 | Services | Married | High.School | Unknown | No | No | Telephone | May | Monday | ... | 1 | 999 | 0 | Nonexistent | 1.1 | 93.99 | -36.4 | 4.86 | 5191.0 | No |
| 2 | 37 | Services | Married | High.School | No | Yes | No | Telephone | May | Monday | ... | 1 | 999 | 0 | Nonexistent | 1.1 | 93.99 | -36.4 | 4.86 | 5191.0 | No |
| 3 | 40 | Admin. | Married | Basic.6Y | No | No | No | Telephone | May | Monday | ... | 1 | 999 | 0 | Nonexistent | 1.1 | 93.99 | -36.4 | 4.86 | 5191.0 | No |
| 4 | 56 | Services | Married | High.School | No | No | Yes | Telephone | May | Monday | ... | 1 | 999 | 0 | Nonexistent | 1.1 | 93.99 | -36.4 | 4.86 | 5191.0 | No |
5 rows × 21 columns
| Number of Instances | Number of Attributes |
|---|---|
| 41188 | 21 |
| Feature | Description |
|---|---|
| Age | numeric |
| Job | Type of Job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown") |
| Marital | marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed) |
| Education | (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown") |
| Default | has credit in default? (categorical: "no","yes","unknown") |
| Housing | has housing loan? (categorical: "no","yes","unknown") |
| Loan | has personal loan? (categorical: "no","yes","unknown") |
| Feature | Description |
|---|---|
| Contact | contact communication type (categorical: "cellular","telephone") |
| Month | last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec") |
| Day of week | last contact day of the week (categorical: "mon","tue","wed","thu","fri") |
| Duration | last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model. |
| Feature | Description |
|---|---|
| Campaign | number of contacts performed during this campaign and for this client (numeric, includes last contact) |
| Pdays | number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted) |
| Previous | number of contacts performed before this campaign and for this client (numeric) |
| Poutcome | outcome of the previous marketing campaign (categorical: "failure","nonexistent","success") |
| Feature | Description |
|---|---|
| Employment Variation Rate | employment variation rate - quarterly indicator (numeric) |
| Consumer Price Index | consumer price index - monthly indicator (numeric) |
| Consumer Confidence Index | consumer confidence index - monthly indicator (numeric) |
| Euribor three Month Rate | euribor* 3 month rate - daily indicator (numeric) |
| Number of Employees | number of employees - quarterly indicator (numeric) |
* the basic rate of interest used in lending between banks on the European Union interbank market and also used as a reference for setting the interest rate on other loans.
| Feature | Description |
|---|---|
| Term Deposit Subscription | has the client Term Deposit Subscription? (binary: "yes","no") |
# Column names grouped by position into feature families; each value is a
# plain list of column labels taken from `Data` in file order.
Dataset_Subcategories = {
    'Bank Client Data': Data.columns[:7].tolist(),
    'Related with the Last Contact of the Current Campaign': Data.columns[7:11].tolist(),
    'Other Attributes': Data.columns[11:15].tolist(),
    'Social and Economic Context Attributes': Data.columns[15:-1].tolist(),
    # The last column is the prediction target.
    'Output variable (Desired Target)': Data.columns[-1:].tolist(),
}
Creating new features:
We can create age categories by following the life-cycle age groupings published by Statistics Canada (statcan.gc.ca):
| Interval | Age Category |
|---|---|
| 00-14 years | Children |
| 15-24 years | Youth |
| 25-64 years | Adults |
| 65 years and over | Seniors |
# Derive 'Age Group' (interval) and 'Age Category' (label) from 'Age'.
# The intervals are right-closed, so for whole-number ages (14, 24], (24, 64] and
# (64, 100] correspond to 15-24, 25-64 and 65+ years.
_age_bands = [(14, 24), (24, 64), (64, 100)]
# `<=` rather than `<`: an age of exactly 14 falls in the Children band (0, 14];
# without that band pd.cut would leave it unbinned.
if Data['Age'].min() <= 14:
    _age_bands.insert(0, (0, 14))
bins = pd.IntervalIndex.from_tuples(_age_bands)
Data['Age Group'] = pd.cut(Data['Age'], bins)
# The label map includes 'Children' so the optional (0, 14] band is never left as a
# raw interval string. NOTE: ages outside every band (e.g. above 100) are not binned
# and end up as the string 'nan' here.
Data['Age Category'] = Data['Age Group'].astype(str).replace({
    '(0, 14]': 'Children',
    '(14, 24]': 'Youth',
    '(24, 64]': 'Adults',
    '(64, 100]': 'Seniors',
})
def Title_Fig(text, CM = "Blues", title = None, FG = (8,6)):
    """Draw a word cloud of ``text`` on a new matplotlib figure.

    Parameters
    ----------
    text : str
        Text handed to ``WordCloud.generate``.
    CM : str
        Colormap name passed to ``WordCloud`` as ``colormap``.
    title : str or None
        Axes title; when None no title is set.
    FG : tuple
        Figure size passed to ``plt.subplots`` as ``figsize``.
    """
    fig, ax = plt.subplots(1, 1, figsize=FG)
    wordcloud = WordCloud(colormap=CM, background_color="white").generate(text)
    ax.imshow(wordcloud)
    # Identity comparison is the correct test for "no title supplied".
    if title is not None:
        ax.set_title(title, fontsize=20)
    # Hide ticks and frame.
    ax.axis('off')


Title_Fig(" ".join(Dataset_Subcategories['Bank Client Data']), CM = "PuBu", title = 'Bank Client Data')
def Dist_Plot(Feat, YR = 2e3):
    """Show a histogram of ``Feat`` colored by 'Term Deposit Subscription'.

    Reads the notebook-wide ``Data`` frame and adds a marginal box plot.

    Parameters
    ----------
    Feat : str
        Column of ``Data`` to plot on the x-axis.
    YR : float
        Upper limit applied to the layout's ``yaxis`` range.
    """
    palette = ['LightSalmon', 'LightBlue']
    edge_color = 'Black'
    # Shared light-gray frame drawn around both axes.
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    heading = {
        'text': '%s Distribution by Whether a Client Term Deposit Subscription' % Feat,
        'x': 0.46, 'y': 0.95, 'xanchor': 'center', 'yanchor': 'top',
    }

    fig = px.histogram(
        Data,
        x=Feat,
        color='Term Deposit Subscription',
        marginal='box',
        color_discrete_sequence=palette,
    )
    fig.update_traces(marker_line_color=edge_color, marker_line_width=0.5, opacity=1)
    fig.update_xaxes(**frame)
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray', **frame)
    # Only layout.yaxis gets the fixed range (presumably the histogram panel,
    # not the marginal box plot's axis).
    fig.layout.yaxis.update(range=[0, YR])
    fig.update_layout(plot_bgcolor='white', title=heading, yaxis_title='Frequency')
    fig.show()


Dist_Plot(Feat='Age', YR=2e3)
def Bank_Client_Data_Plot(Feat, FigH = 500, VS = 0.08, title_y = 0.90, XR = 100, TP = 'outside'):
    """Plot ``Feat`` against marital status, one panel per subscription outcome.

    Rows of the notebook-wide ``Data`` frame are counted per
    (``Feat``, 'Marital', 'Term Deposit Subscription') combination and each count is
    expressed as a percentage of all counted rows. 'No' outcomes go in the top panel
    and 'Yes' outcomes in the bottom panel, as horizontal bars colored by 'Marital'.

    Parameters
    ----------
    Feat : str
        Column of ``Data`` shown on the y-axis.
    FigH : int
        Figure height.
    VS : float
        Vertical spacing between the two panels.
    title_y : float
        Vertical position of the figure title.
    XR : float
        Upper limit of the bottom panel's x-axis range.
    TP : str
        Text position of the bar labels.
    """
    # The first two colors are the original palette; the extra ones keep further
    # marital categories distinguishable now that every category is drawn.
    Colors = ['PaleTurquoise', 'LightGreen', 'LightSalmon', 'Plum']
    LC = 'Black'
    Cols = [Feat, 'Marital', 'Term Deposit Subscription']
    Temp = Data[Cols].groupby(Cols)['Term Deposit Subscription'].agg(['count'])
    Temp = Temp.rename(columns={'count': 'Count'})
    Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
    Temp.reset_index(drop=False, inplace=True)
    # One fixed color per marital status so both panels use the same mapping even
    # when a status is missing from one of them.
    color_map = {status: Colors[i % len(Colors)]
                 for i, status in enumerate(Temp['Marital'].unique())}

    # Figures
    fig = make_subplots(rows=2, cols=1, vertical_spacing=VS, shared_xaxes=True,
                        subplot_titles=('Term Deposit Subscription: No',
                                        'Term Deposit Subscription: Yes'))
    legend_shown = set()
    for row, answer in enumerate(('No', 'Yes'), start=1):
        panel = px.bar(Temp.loc[Temp['Term Deposit Subscription'] == answer],
                       y=Feat, x='Percentage', orientation='h',
                       color='Marital', text='Percentage', hover_data=list(Temp.columns),
                       color_discrete_map=color_map)
        # Add every trace px produced instead of hard-coded indices 0 and 1, which
        # dropped all further categories and failed when fewer than two existed.
        for trace in panel.data:
            # Show each marital status once in the legend.
            trace.showlegend = trace.name not in legend_shown
            legend_shown.add(trace.name)
            fig.add_trace(trace, row=row, col=1)
    fig.update_traces(marker_line_color=LC, marker_line_width=1, opacity=1)

    # Update
    fig.update_layout(height=FigH, plot_bgcolor='white', legend_orientation='h')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     showgrid=True, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.update_traces(texttemplate='%{text:.2}%', textposition=TP)
    fig.update_xaxes(title_text='Percent', range=[0, XR], row=2, col=1)
    fig.update_yaxes(title_text=Feat, row=1, col=1)
    fig.update_yaxes(title_text=Feat, row=2, col=1)
    fig.update_layout(title={'text': '%s by Marital Status and Term Deposit Subscription' % Feat,
                             'x': 0.50, 'y': title_y, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# --- Bank client features: one chart per feature, with word-cloud headings ---
Bank_Client_Data_Plot(Feat='Age Category', XR=60)

Title_Fig(" ".join(Data['Job'].unique()), CM="Greens", title='Job Titles')
Bank_Client_Data_Plot(Feat='Job', FigH=1000, VS=0.03, title_y=0.94, XR=20)

Title_Fig(" ".join(Data['Education'].unique()), CM="Purples", title='Education')
Bank_Client_Data_Plot(Feat='Education', FigH=900, VS=0.03, title_y=0.93, XR=16, TP='outside')

Bank_Client_Data_Plot(Feat='Default', FigH=450, VS=0.08, title_y=0.88, XR=50, TP='outside')
Bank_Client_Data_Plot(Feat='Housing', FigH=550, VS=0.06, title_y=0.9, XR=30)
Bank_Client_Data_Plot(Feat='Loan', FigH=500, VS=0.06, title_y=0.89, XR=50)

# --- Heading for the last-contact feature family ---
Title_Fig(
    " ".join(Dataset_Subcategories['Related with the Last Contact of the Current Campaign']),
    CM="PiYG",
    title='Related with the Last Contact of the Current Campaign',
)
# Build `df`, a copy of `Data` whose 'Duration' column is replaced by the string label
# of the right-closed interval it falls in; the -1 lower edge keeps a duration of 0
# inside the first interval.
bins = pd.IntervalIndex.from_breaks([-1, 100, 200, 400, 800, 5000])
df = Data.assign(Duration=pd.cut(Data['Duration'], bins).astype(str))
def Contact_Features_Plot(Feat, FigH = 500, VS = 0.08, title_y = 0.90, XR = 100, TP = 'outside'):
    """Plot ``Feat`` against contact type, one panel per subscription outcome.

    Rows of the notebook-wide ``df`` frame are counted per
    (``Feat``, 'Contact', 'Term Deposit Subscription') combination and each count is
    expressed as a percentage of all counted rows. 'No' outcomes go in the top panel
    and 'Yes' outcomes in the bottom panel, as horizontal bars colored by 'Contact'.

    Parameters
    ----------
    Feat : str
        Column of ``df`` shown on the y-axis.
    FigH : int
        Figure height.
    VS : float
        Vertical spacing between the two panels.
    title_y : float
        Vertical position of the figure title.
    XR : float
        Upper limit of the bottom panel's x-axis range.
    TP : str
        Text position of the bar labels.
    """
    Colors = ['Violet', 'BlueViolet']
    LC = 'Indigo'
    Cols = [Feat, 'Contact', 'Term Deposit Subscription']
    Temp = df[Cols].groupby(Cols)['Term Deposit Subscription'].agg(['count'])
    Temp = Temp.rename(columns={'count': 'Count'})
    Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
    Temp.reset_index(drop=False, inplace=True)
    # One fixed color per contact type so both panels use the same mapping; colors
    # repeat only if there are more contact types than palette entries.
    color_map = {kind: Colors[i % len(Colors)]
                 for i, kind in enumerate(Temp['Contact'].unique())}

    # Figures
    fig = make_subplots(rows=2, cols=1, vertical_spacing=VS, shared_xaxes=True,
                        subplot_titles=('Term Deposit Subscription: No',
                                        'Term Deposit Subscription: Yes'))
    legend_shown = set()
    for row, answer in enumerate(('No', 'Yes'), start=1):
        panel = px.bar(Temp.loc[Temp['Term Deposit Subscription'] == answer],
                       y=Feat, x='Percentage', orientation='h',
                       color='Contact', text='Percentage', hover_data=list(Temp.columns),
                       color_discrete_map=color_map)
        # Add every trace px produced rather than assuming exactly two exist.
        for trace in panel.data:
            # Show each contact type once in the legend.
            trace.showlegend = trace.name not in legend_shown
            legend_shown.add(trace.name)
            fig.add_trace(trace, row=row, col=1)
    fig.update_traces(marker_line_color=LC, marker_line_width=1, opacity=1)

    # Update
    fig.update_layout(height=FigH, plot_bgcolor='white', legend_orientation='h')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     showgrid=True, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.update_traces(texttemplate='%{text:.2}%', textposition=TP)
    fig.update_xaxes(title_text='Percent', range=[0, XR], row=2, col=1)
    fig.update_yaxes(title_text=Feat, row=1, col=1)
    fig.update_yaxes(title_text=Feat, row=2, col=1)
    # The bars are grouped by contact type, so the title names that grouping
    # (it previously said "Marital Status").
    fig.update_layout(title={'text': '%s by Contact Type and Term Deposit Subscription' % Feat,
                             'x': 0.50, 'y': title_y, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# --- Last-contact features: one chart per feature, with word-cloud headings ---
Contact_Features_Plot(Feat='Duration', FigH=600, VS=0.06, title_y=0.90, XR=20)

Title_Fig(" ".join(Data['Month'].unique()), CM="Spectral", title='Months')
Contact_Features_Plot(Feat='Month', FigH=1000, VS=0.04, title_y=0.94, XR=25)

Title_Fig(" ".join(Data['Day Of Week'].unique()), CM="PuOr", title='Day Of Week')
Contact_Features_Plot(Feat='Day Of Week', FigH=650, VS=0.04, title_y=0.92, XR=14)

# --- Heading for the "other attributes" feature family ---
Title_Fig(" ".join(Dataset_Subcategories['Other Attributes']), CM="winter", title='Other Attributes')
The number of contacts performed during this campaign and for this client (numeric, includes the last contact)
# Chart: binned number of contacts in this campaign, colored by the previous campaign's
# outcome, with one panel per term-deposit subscription outcome. Percentages are shares
# of all counted rows.
Colors = ['LightCoral', 'PaleTurquoise', 'LightGreen']
LC = 'Black'
Cols = ['Campaign', 'Term Deposit Subscription', 'Poutcome']
# Explicit copy: the 'Campaign' column is overwritten below, and assigning into a
# plain Data[Cols] selection is chained assignment.
Temp = Data[Cols].copy()
bins = pd.IntervalIndex.from_tuples([(0, 5), (5, 10), (10, 20),(20, 45), (45, 60)])
# Interval labels in numeric order; used to order the y-axis categories below.
Campaign_Labels = [str(interval) for interval in bins]
Temp['Campaign'] = pd.cut(Temp['Campaign'], bins).astype(str)
Temp = Temp.groupby(Cols)['Term Deposit Subscription'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp.reset_index(drop = False, inplace = True)
# Drop combinations whose share rounds to 0.00, then sort without mutating a slice.
Temp = Temp[Temp.Percentage != 0].sort_values(by=['Campaign','Poutcome'])
# Figures
fig = make_subplots(rows=2, cols=1, vertical_spacing = 0.06, shared_xaxes=True,
                    subplot_titles=('Term Deposit Subscription: No', 'Term Deposit Subscription: Yes'))
# Top
fig1 = px.bar(Temp.loc[Temp['Term Deposit Subscription'] == 'No'], y= 'Campaign', x= 'Percentage',
              orientation='h', color = 'Poutcome', text = 'Percentage', hover_data= Temp.columns,
              color_discrete_sequence = Colors)
# Add every trace px produced rather than assuming exactly three exist.
for trace in fig1.data:
    fig.add_trace(trace, row=1, col=1)
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1, row=1, col=1)
# Bottom
fig2 = px.bar(Temp.loc[Temp['Term Deposit Subscription'] == 'Yes'], y= 'Campaign', x= 'Percentage', orientation='h',
              color = 'Poutcome', text = 'Percentage', hover_data= Temp.columns,
              color_discrete_sequence = Colors)
for trace in fig2.data:
    fig.add_trace(trace, row=2, col=1)
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1, showlegend = False, row=2, col=1)
# Update
fig.update_layout(height= 700)
fig.update_layout(plot_bgcolor= 'white', legend_orientation='h')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
# The bucket labels are strings, so sorting them puts '(5, 10]' after '(45, 60]';
# pin the category order to the numeric bin order instead.
fig.update_yaxes(categoryorder='array', categoryarray=Campaign_Labels)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_traces(texttemplate='%{text:.2}%', textposition= 'outside')
fig.update_xaxes(title_text='Percent', range=[0, 100], row=2, col=1)
fig.update_yaxes(title_text='The number of contacts performed', row=1, col=1)
fig.update_yaxes(title_text='The number of contacts performed', row=2, col=1)
fig.update_layout(title={'text': 'The number of contacts performed<br>by the outcome of the previous marketing and term deposit subscription',
                         'x':0.50, 'y': 0.95, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
# Chart: number of contacts before this campaign, colored by the previous campaign's
# outcome, with one panel per term-deposit subscription outcome. Percentages are shares
# of all counted rows.
Colors = ['PaleTurquoise', 'LightCoral', 'LightGreen']
LC = 'Black'
Cols = ['Previous', 'Poutcome', 'Term Deposit Subscription']
Temp = Data[Cols]
Temp = Temp.groupby(Cols)['Term Deposit Subscription'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100 * Temp['Count'] / Temp['Count'].sum(), 2)
Temp.reset_index(drop = False, inplace = True)
# Drop combinations whose share rounds to 0.00, then sort without mutating a slice.
Temp = Temp[Temp.Percentage != 0].sort_values(by=['Previous','Poutcome'])
# Figures
fig = make_subplots(rows=2, cols=1, vertical_spacing = 0.04, shared_xaxes=True,
                    subplot_titles=('Term Deposit Subscription: No', 'Term Deposit Subscription: Yes'))
# Top
fig1 = px.bar(Temp.loc[Temp['Term Deposit Subscription'] == 'No'], y= 'Previous', x= 'Percentage', orientation='h',
              color = 'Poutcome', text = 'Percentage', hover_data= Temp.columns,
              color_discrete_sequence = Colors)
# Add every trace px produced rather than assuming exactly three exist.
for trace in fig1.data:
    fig.add_trace(trace, row=1, col=1)
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1, row=1, col=1)
# Bottom
fig2 = px.bar(Temp.loc[Temp['Term Deposit Subscription'] == 'Yes'], y= 'Previous', x= 'Percentage', orientation='h',
              color = 'Poutcome', text = 'Percentage', hover_data= Temp.columns,
              color_discrete_sequence = Colors)
for trace in fig2.data:
    fig.add_trace(trace, row=2, col=1)
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1, showlegend = False, row=2, col=1)
# Update
fig.update_layout(height= 900)
fig.update_layout(plot_bgcolor= 'white', legend_orientation='h')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_traces(texttemplate='%{text:.2}%', textposition= 'outside')
fig.update_xaxes(title_text='Percent', range=[0, 90], row=2, col=1)
# Both panels plot the same quantity, so they carry the same descriptive axis title
# (the top panel previously showed the raw column name 'Previous').
fig.update_yaxes(title_text='The number of contacts performed before this campaign', row=1, col=1)
fig.update_yaxes(title_text='The number of contacts performed before this campaign', row=2, col=1)
fig.update_layout(title={'text': 'The number of contacts performed before this campaign<br>by the outcome of the previous marketing and term deposit subscription',
                         'x':0.50, 'y': 0.95, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014
S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimaraes, Portugal, October, 2011. EUROSIS. [bank.zip]